#################################################
##   Load the data                          #####
#################################################


# Switch to the data directory (wd_data is defined outside this section),
# read the main dataset and keep only rows whose sruT value is not missing.
setwd(wd_data)
data <- read.csv("main_dataset.csv")
data <- data[which(!is.na(data$sruT)), ]

# Excess public housing share relative to the target: hlm_pre_p rounded to
# two decimals, minus a target of .20 for policy periods up to 3 and .25
# afterwards. A value > 0 means no policy applies.
data$running_hlm <- round(data$hlm_pre_p, 2) -
  ifelse(data$policy_period <= 3, .20, .25)

# Treatment indicator "postThreshold": 1 on rows where the running variable
# is strictly positive (the population threshold is passed), 0 otherwise,
# and NA where `running` is missing. The result stays grouped by CODGEO.
data <- mutate(
  group_by(data, CODGEO),
  postThreshold = as.numeric(running > 0)
)

# Define the period in which each unit (CODGEO) passes the threshold and
# store it on every row of that unit as `threshold_period`.
# Units whose indicator never switches from 0 to 1 inside the data keep
# threshold_period = NA; this includes units that are already above the
# threshold on their first row, because the first difference is NA there.
# NOTE(review): if a unit switches from 0 to 1 more than once, its rows end
# up with different threshold_period values (each switch fills downwards
# until the next one) -- confirm this cannot happen in the data.
data <- data %>%
  arrange(CODGEO, year, policy_period) %>%
  group_by(CODGEO) %>%
  # first difference of the indicator within the unit: 1 marks a 0 -> 1 change
  mutate(match = c(NA, diff(postThreshold))) %>%
  # on the switching row, save the period indicator; NA everywhere else
  mutate(threshold_period = ifelse(match == 1, policy_period, NA)) %>%
  # apply it to the whole group: forwards first, then backwards
  tidyr::fill(threshold_period, .direction = "down") %>%
  tidyr::fill(threshold_period, .direction = "up") %>%
  # drop the grouping so that later steps do not silently operate per CODGEO
  ungroup()

# the helper column is no longer needed
data$match <- NULL

# Individual leads and lags: policy_period minus the unit's threshold_period.
# Units without a threshold_period (NA) are assigned 0.
data$threshold_period_ind <- with(
  data,
  ifelse(is.na(threshold_period), 0, policy_period - threshold_period)
)

#################################################
##    Data adjustment                       #####
#################################################

# Vote shares and turnout: divide each column by 100
# (presumably converting percentages to fractions -- verify against the data).
for (var in c(
  "left", "right", "extreme.left", "turnout", "fn"
)) {
  data[[var]] <- data[[var]] / 100
}

# Share of home ownership: num_res_own divided by num_res.
data[["res_owned_share"]] <- data[["num_res_own"]] / data[["num_res"]]

# Create pre-policy shares: take each unit's hlm_share and res_owned_share
# from its year-1999 rows and attach them to all rows of that unit.
# NOTE: the new columns carry a "_90" suffix although they come from 1999;
# the names are kept unchanged because other code may refer to them.
df1 <- data[which(data$year == 1999), c("CODGEO", "hlm_share", "res_owned_share")]
colnames(df1) <- c("CODGEO", "hlm_share_90", "res_owned_share_90")

# merge() takes its join key through `by`. It has no `on` argument: such an
# argument is silently ignored and the key falls back to whatever column
# names the two tables share, so the key is stated explicitly here.
# all.x = TRUE keeps units that have no 1999 row (their baselines are NA).
# NOTE(review): a CODGEO with several 1999 rows would duplicate its rows in
# the result -- confirm that 1999 rows are unique per CODGEO.
data <- merge(data, df1, by = "CODGEO", all.x = TRUE)

rm(df1)

#################################################
##   Subset the sample For Diff in Diff     #####
#################################################

# Difference-in-differences sample. A row is kept when all three conditions
# hold (rows where a condition evaluates to NA are dropped):
#   * policy_period >= 0 : the post-policy period, which includes the 2002
#     elections;
#   * abs(running) < bdw : within the bandwidth bdw around the threshold;
#   * running_pre <= 0   : below the population threshold at the beginning.
#     This excludes the always-treated and keeps only controls and places
#     that are treated after period 1.
df_did <- data[which(
  data$policy_period >= 0 &
    abs(data$running) < bdw &
    data$running_pre <= 0
), ]


#################################################
##   Subset for placebo #####
#################################################

# Optional placebo restrictions of the DiD sample. Each one is switched on
# by a logical flag (rural, urban, excess) defined outside this section.

# no urban agglomerations: keep rows whose agglo_name is missing
if (rural) {
  df_did <- df_did[which(is.na(df_did$agglo_name)), ]
}

# all urban agglomerations with no treated municipality
if (urban) {
  df_did <- df_did[which(df_did$agglo_sru == 0), ]
}

# all urban municipalities with excess public housing
if (excess) {
  df_did <- df_did[which(df_did$agglo_sru == 1), ]
  # The mask must be built from df_did itself: a mask taken from the full
  # `data` table refers to different rows and would select the wrong
  # observations (or all-NA rows for positions beyond nrow(df_did)).
  df_did <- df_did[which(df_did$running_hlm > 0), ]
}


#################################################


# lags and leads: one dummy column per value of threshold_period_ind.
# Negative values are named m<k> (m1, m2, ...), non-negative values p<k>
# (p0, p1, ...); p0 is the last period before policy application.
# The first column returned by dummy_cols() is the input vector itself and
# is dropped; drop = FALSE keeps a data frame even if only one dummy remains.
dummies <- fastDummies::dummy_cols(df_did$threshold_period_ind)[, -1, drop = FALSE]

# Name every dummy from the value at the end of its generated column name
# (the text after the last underscore) instead of from a fixed list of nine
# names. A fixed list is only correct when all nine values are present in
# the current subsample and come out in one particular order.
event_time <- as.numeric(sub("^.*_", "", colnames(dummies)))
# a missing event time cannot be labelled as a lead or a lag
stopifnot(!anyNA(event_time))
colnames(dummies) <- ifelse(
  event_time < 0,
  paste0("m", abs(event_time)),
  paste0("p", event_time)
)

df_did <- cbind(df_did, dummies)
rm(dummies, event_time)


#################################################
##   quantiles                              #####
#################################################


# immigration 1999 quantiles: split imm_share_99 into three groups labelled
# "1", "2", "3" at its 0, 1/3, 2/3 and 1 quantiles (computed on df_did,
# ignoring NAs).
# The break points must end exactly at probability 1: a grid built with
# by = 0.33333 stops at 0.99999, which puts the top break below the maximum
# and turns the largest observations into NA.
# include.lowest = TRUE puts the minimum into the first group.
df_did$imm_quant_99 <- cut(
  df_did$imm_share_99,
  breaks = quantile(
    df_did$imm_share_99,
    probs = seq(0, 1, length.out = 4),
    na.rm = TRUE
  ),
  labels = c("1", "2", "3"),
  include.lowest = TRUE
)
